## NOTE(review): this wipes the caller's global environment; kept so the
## script's start-up side effects are unchanged.
rm(list = ls())

library(readr)
## plyr is attached BEFORE dplyr on purpose: attaching plyr last would mask
## dplyr verbs such as mutate() and summarise() with the plyr versions.
## Only ldply() is needed from plyr in this script.
library(plyr)
library(dplyr)

## fixed seed so the weighted sample drawn below is reproducible
set.seed(987654321)

## This script recalculates weights for analyses.

## Multiple directories, mostly for handling data sets of different sizes.
## out_path and in_path are glued to file names with paste0(), so a non-empty
## value needs its trailing path separator; data_dir2 goes through file.path().
data_dir <- ""      # not referenced in the code below
data_dir2 <- ""     # directory listed for the tweet files
out_path <- ""      # location of the comparison-weights csv
in_path <- ""       # just a place for big data sets, not necessarily read-only
matched_file <- ""  # csv of matched voter records

#### voter demos
## Load the matched voter file with every column parsed as text, then keep
## only the profile id and the demographic fields used for weighting.
## NOTE(review): the summaries near the end of this script refer to
## `arabic_name` and `age` columns of matched_data, which this select() drops.
matched_data <- select(
    read_csv(
        matched_file,
        col_types=cols(.default=col_character()),
        quote="",
        comment="",
        trim_ws=TRUE
    ),
    twProfileID,
    birth_year, race, party_affiliation,
    gender, state
)


#### users
## Per-user tweet counts (tab separated, no header row).
## userid and the True/False flag columns are forced to character:
##   * left to automatic type conversion, userid becomes a double, which
##     cannot represent ids above 2^53 exactly and therefore breaks the later
##     %in% matching against the character twProfileID / id columns;
##   * the flags are compared to the literal strings "True"/"False" below, so
##     they must not be auto-converted to another type.
## All remaining columns keep read.table's automatic conversion.
arabic_geo_tweets <- read.table(
    paste0(in_path, "arabic_usgeo_counts_20150901_to_20170213.txt.gz"),
    sep="\t", quote="\"", comment.char="", fill=TRUE, header=FALSE,
    col.names=c(
        "date","userid","user_lang","lang","usa","state",
        "arabic_name","nonarabic_name","missing_coords","tweets"
    ),
    colClasses=c(
        userid="character",
        usa="character",
        arabic_name="character",
        missing_coords="character"
    )
)

## keep rows flagged as US, Arabic name, with coordinates and a known state
arabic_geo_tweets <- subset(
    arabic_geo_tweets,
    usa=="True" & arabic_name=="True" & missing_coords=="False" & state!=""
)

## distinct user ids that make up the Arabic-name geotagged sample
arabic_geo_ids <- unique(arabic_geo_tweets$userid)

## Comparison-sample weights. twProfileID is read as character so that large
## ids are not rounded by conversion to double and compare correctly with the
## character ids used everywhere else in this script.
matched_sample_weights <- read.csv(
    paste0(out_path, "arabic_voters_comparison_weights.csv"),
    colClasses=c(twProfileID="character")
)

## Draw 50000 profile ids without replacement, with probability proportional
## to approx_weight (reproducible through the set.seed() call at the top).
## NOTE(review): sample() errors if the file has fewer than 50000 rows.
matched_sample_ids <- sample(
    matched_sample_weights$twProfileID, size=50000,
    prob=matched_sample_weights$approx_weight / sum(matched_sample_weights$approx_weight)
)



## users + voter demos
## Keep the voter rows whose profile id appears in either sample, then flag
## membership in each one: arabic_sample as a 0/1 integer, matched as logical.
matched_data_sub <- matched_data %>%
    dplyr::filter(
        twProfileID %in% unique(c(matched_sample_ids, arabic_geo_ids))
    ) %>%
    dplyr::mutate(
        arabic_sample = as.integer(twProfileID %in% arabic_geo_ids),
        matched = twProfileID %in% matched_sample_ids
    )

## this is the weighting command
## Bin birth year into 5-year groups [1900,1905), ..., [1995,2000); values
## outside that range (or non-numeric) become NA. The binned table is kept
## because birth_year_group is one of the join keys below and must therefore
## exist on the user-level table as well as on the aggregated one.
matched_data_binned <- matched_data_sub %>%
    dplyr::mutate(
        birth_year_group = cut(
            as.numeric(birth_year),
            breaks=seq(1900, 2000, 5),
            right=FALSE
        )
    )

## One row per state x gender x party x birth-year cell, with the number of
## Arabic-sample users and of matched-sample users in the cell.
## matched_weight = (cell share of the Arabic sample) /
##                  (cell share of the matched sample).
## NOTE(review): a cell without matched users divides by zero (Inf weight).
agg_df <- matched_data_binned %>%
    group_by(
        state,
        gender, party_affiliation,
        birth_year_group
    ) %>%
    dplyr::summarise(
        sum_arabic_sample = sum(arabic_sample),
        sum_matched = sum(matched)
    ) %>%
    ungroup() %>%
    dplyr::mutate(
        matched_weight = sum_arabic_sample / sum_matched *
            (sum(sum_matched) / sum(sum_arabic_sample))
    )

## Attach the cell-level counts and weight to every user; the keys are all
## columns of agg_df other than the three computed ones.
matched_data_with_weights <- left_join(
    matched_data_binned,
    agg_df,
    by=setdiff(
        names(agg_df),
        c("sum_arabic_sample","sum_matched","matched_weight")
    )
)


## Slim user-level table: profile id plus the cell counts and weight.
matched_data_with_weights_sub <- matched_data_with_weights %>%
    dplyr::select(
        twProfileID,
        sum_arabic_sample,
        sum_matched,
        matched_weight
    )


#### tweets
## Regular expression(s) for the tweet files to load, and which one to use.
thegrep <- "source_with"

i <- 1

## File names in data_dir2 that match the selected pattern.
thefiles <- list.files(data_dir2, pattern=thegrep[i])

## Chunk callback: keep only the rows of chunk `x` whose `coordinates` value
## is not NA. `pos` is part of the callback signature and is not used.
thefilter <- function(x, pos) {
    has_coordinates <- with(x, !is.na(coordinates))
    x[has_coordinates, , drop = FALSE]
}

## Read every selected tweet file in chunks of 100000 rows, all columns as
## text, keeping only the rows that pass thefilter(). One filtered table per
## file is stored under the file's name.
input_data_list <- list()
for (thefile in thefiles) {
    ## progress line: file name and its position in the list
    cat("\n",thefile,":", match(thefile, thefiles), "out of",length(thefiles),"\n")
    input_data_list[[thefile]] <- read_delim_chunked(
        file.path(data_dir2,thefile),
        delim="\t", quote="\"", comment="",
        col_types=cols(
            .default=col_character()
        ),
        callback = DataFrameCallback$new(thefilter),
        chunk_size = 100000
    )
    cat("    # rows:", nrow(input_data_list[[thefile]]), "\n")
}

## Stack the per-file tables; ldply() adds an `.id` column holding the list
## name (the file name). stringsAsFactors=FALSE keeps the text columns as
## character on R < 4.0 too, where data.frame() would otherwise turn them
## into factors and break the string comparisons on `date` further down.
input_data_prep <- ldply(input_data_list, data.frame, stringsAsFactors=FALSE)


## Flag each tweet row by whether its author id is in the Arabic-name sample
## and/or the matched sample, and keep rows that belong to at least one.
matched_sources <- input_data_prep %>%
    mutate(
        arabic_name = id %in% arabic_geo_ids,
        matched = id %in% matched_sample_ids
    ) %>%
    subset(arabic_name | matched)

## users + tweets
## Left-merge the user-level counts and weight onto the tweet rows (tweet
## `id` against `twProfileID`), then add the 0/1 Arabic-sample indicator,
## which carries the same information as arabic_name.
matched_sources <- merge(
    x=matched_sources,
    y=matched_data_with_weights_sub,
    by.x="id",
    by.y="twProfileID",
    all.x=TRUE
) %>%
    mutate(arabic_sample = as.integer(id %in% arabic_geo_ids))

## Persist the weighted tweet table and the user-level weights together in
## one .RData file under in_path.
save(
    list=c("matched_sources", "matched_data_with_weights_sub"),
    file=paste0(
        in_path,
        "arabic_sample_and_comparison_usgeo_text_20150901_to_20170213_sources.RData"
    )
)



## all voters with Arabic sounding names
## The summaries below need `arabic_name` and `age` on matched_data, but the
## select() that builds matched_data earlier in this script keeps only
## twProfileID, birth_year, race, party_affiliation, gender and state.
## Fail with an explicit message instead of an obscure lookup error.
## NOTE(review): restore these columns (with usable types) where matched_data
## is loaded, or derive them, before relying on this section.
stopifnot(
    "matched_data must contain `arabic_name` and `age` for the voter summaries" =
        all(c("arabic_name", "age") %in% names(matched_data))
)

## One row per voter flagged arabic_name with a known birth year, computed
## once and reused for the age summary and the proportion tables.
agged <- subset(
    matched_data,
    arabic_name & !is.na(birth_year)
) %>%
    group_by(twProfileID, age, race, state, gender, party_affiliation) %>%
    dplyr::summarise(arabic_name = mean(arabic_name)) %>%
    ungroup()

## mean and sd of age (left as a bare top-level expression so it prints)
agged %>%
    dplyr::summarise(mean_age = mean(age, na.rm=TRUE), sd_age = sd(age, na.rm=TRUE))

## proportion tables; race and state sorted in ascending order of share
agged_tables <- with(
    agged,
    list(
        table_race = sort(prop.table(table(race))),
        table_state = sort(prop.table(table(state))),
        table_gender = prop.table(table(gender)),
        table_party_affiliation = prop.table(table(party_affiliation))
    )
)


## geotagging voters with Arabic sounding names
## NOTE(review): this section expects matched_sources to carry date,
## registered, birth_year, age, race, state, gender and party_affiliation.
## None of them is added by the code in this script, so they presumably come
## from the tweet files themselves -- confirm.

## Number of distinct Arabic-name users with a tweet row dated from
## 2015-11-01 up to (not including) 2016-01-01 whose id also appears in the
## voter file. Dates are compared as strings.
length(
    unique(
        subset(
            matched_sources,
            date >= "2015-11-01" & date < "2016-01-01"
            & arabic_name & id %in% matched_data$twProfileID
        )$id
    )
)

## One row per registered Arabic-name user with a known birth year in the
## same window; built once and reused for the age summary and the tables.
agged_geo <- subset(
    matched_sources,
    date >= "2015-11-01" & date < "2016-01-01"
    & arabic_name & registered==1 & !is.na(birth_year)
) %>%
    group_by(id, age, race, state, gender, party_affiliation) %>%
    dplyr::summarise(arabic_name = mean(arabic_name)) %>%
    ungroup()

## mean and sd of age across those users
arabic_age <- agged_geo %>%
    dplyr::summarise(mean_age = mean(age, na.rm=TRUE), sd_age = sd(age, na.rm=TRUE))

## proportion tables; race and state sorted in ascending order of share
agged_geo_tables <- with(
    agged_geo,
    list(
        table_race = sort(prop.table(table(race))),
        table_state = sort(prop.table(table(state))),
        table_gender = prop.table(table(gender)),
        table_party_affiliation = prop.table(table(party_affiliation))
    )
)
